Pre-process data from multipe datasets to get data max temp stats over CONUS

import cartopy.crs as ccrs
import intake
import uxarray as ux
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from matplotlib.colors import LinearSegmentedColormap
import numpy as np
import pandas as pd
# rda_url           = 'https://data.rda.ucar.edu/'
# era5_catalog      = rda_url + 'pythia_era5_24/pythia_intake_catalogs/era5_catalog.json'
era5_catalog   =  '/glade/campaign/collections/rda/data/d850001/catalogs/posix/era5/era5_catalog_posix.json'

Compare datasets based on availability in the HEALPix format

Dataset

Variable

Long Name

Units

Frequency available

Comments

ERA5

2t or ‘VAR_2T’

temps

K

hourly

Reanalysis

| CONUS | tas | Surface air emps | K | Hourly | WRF output|

# Define the period of interest
start, end = '2020-01-01', '2020-12-31'
cat_url = "https://digital-earths-global-hackathon.github.io/catalog/catalog.yaml"
cat = intake.open_catalog(cat_url).NCAR
cat
NCAR:
  args:
    path: https://digital-earths-global-hackathon.github.io/catalog/NCAR/catalog.yaml
  description: catalog as visible from NCAR
  driver: intake.catalog.local.YAMLFileCatalog
  metadata:
    catalog_dir: https://digital-earths-global-hackathon.github.io/catalog
list(cat)
['CERES_EBAF',
 'ERA5',
 'IR_IMERG',
 'JRA3Q',
 'MERRA2',
 'arp-gem-1p3km',
 'arp-gem-2p6km',
 'casesm2_10km_nocumulus',
 'icon_d3hp003',
 'icon_d3hp003aug',
 'icon_d3hp003feb',
 'icon_ngc4008',
 'ifs_tco3999-ng5_deepoff',
 'ifs_tco3999-ng5_rcbmf',
 'ifs_tco3999-ng5_rcbmf_cf',
 'ifs_tco3999_rcbmf',
 'mpas_dyamond1',
 'mpas_dyamond2',
 'mpas_dyamond3',
 'nicam_220m_test',
 'nicam_gl11',
 'scream-dkrz',
 'scream2D_hrly',
 'scream_lnd',
 'scream_ne120',
 'tracking-d3hp003',
 'um_Africa_km4p4_RAL3P3_n1280_GAL9_nest',
 'um_CTC_km4p4_RAL3P3_n1280_GAL9_nest',
 'um_SAmer_km4p4_RAL3P3_n1280_GAL9_nest',
 'um_SEA_km4p4_RAL3P3_n1280_GAL9_nest',
 'um_glm_n1280_CoMA9_TBv1p2',
 'um_glm_n1280_GAL9',
 'um_glm_n2560_RAL3p3',
 'wrf_conus',
 'wrf_samerica']

Get max temp data from different models and obs

Obtain data

## Seasonal max precip over CONUS in 2020 ?

WRF -South America data

zoom_level = 9
%%time
ds_wrf = cat.wrf_conus(zoom=zoom_level).to_dask()
da_wrf = ds_wrf.surface_temperature
da_wrf = da_wrf.sel(Time= slice(start,end))
da_wrf
CPU times: user 3.33 s, sys: 302 ms, total: 3.63 s
Wall time: 4.94 s
/glade/u/apps/opt/conda/envs/2025-digital-earths-global-hackathon/lib/python3.12/site-packages/intake_xarray/base.py:21: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.
  'dims': dict(self._ds.dims),
<xarray.DataArray 'surface_temperature' (Time: 8784, cell: 193604)> Size: 7GB
dask.array<getitem, shape=(8784, 193604), dtype=float32, chunksize=(48, 65536), chunktype=numpy.ndarray>
Coordinates:
  * Time     (Time) datetime64[ns] 70kB 2020-01-01 ... 2020-12-31T23:00:00
  * cell     (cell) int64 2MB 540117 540118 540119 ... 2097149 2097150 2097151
    crs      float32 4B nan
Attributes:
    description:       Air temperature at the lowest model level
    domain_extent:     (-138.73135, -57.068634, 17.647308, 57.34342)
    grid_mapping:      healpix_nested
    healpix_nside:     512
    quantization:      quantization_info
    quantization_nsd:  5
    regrid_method:     easygems_delaunay
    stagger:           
    units:             K
# np.set_printoptions(threshold=np.inf)

ERA5 data

cat_era5 = intake.open_esm_datastore(era5_catalog)
# cat_era5.df[['variable','long_name']].values
/glade/u/apps/opt/conda/envs/2025-digital-earths-global-hackathon/lib/python3.12/site-packages/intake_esm/cat.py:249: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv(
# cat_era5.df
cat_temp_era5 = cat_era5.search(variable='VAR_2T',year=2020)
cat_temp_era5.df
Unnamed: 0 era_id datatype level_type step_type table_code param_code variable long_name units year month format frequency path
0 551912 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 1 nc hourly /glade/campaign/collections/rda/data/d633000/e...
1 551974 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 2 nc hourly /glade/campaign/collections/rda/data/d633000/e...
2 552036 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 3 nc hourly /glade/campaign/collections/rda/data/d633000/e...
3 552098 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 4 nc hourly /glade/campaign/collections/rda/data/d633000/e...
4 552160 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 5 nc hourly /glade/campaign/collections/rda/data/d633000/e...
5 552222 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 6 nc hourly /glade/campaign/collections/rda/data/d633000/e...
6 552284 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 7 nc hourly /glade/campaign/collections/rda/data/d633000/e...
7 552346 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 8 nc hourly /glade/campaign/collections/rda/data/d633000/e...
8 552408 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 9 nc hourly /glade/campaign/collections/rda/data/d633000/e...
9 552470 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 10 nc hourly /glade/campaign/collections/rda/data/d633000/e...
10 552532 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 11 nc hourly /glade/campaign/collections/rda/data/d633000/e...
11 552594 e5 an NaN sfc 128 167 VAR_2T 2 metre temperature K 2020 12 nc hourly /glade/campaign/collections/rda/data/d633000/e...
%%time
dsets_era5 = cat_temp_era5.to_dataset_dict()
--> The keys in the returned dictionary of datasets are constructed as follows:
	'datatype.step_type'
100.00% [1/1 00:01<00:00]
CPU times: user 1.19 s, sys: 104 ms, total: 1.29 s
Wall time: 1.22 s
dsets_era5.keys()
dict_keys(['an.sfc'])
da_era5 = dsets_era5['an.sfc']
da_era5 = da_era5.VAR_2T
da_era5['latitude'] = da_era5['latitude'].assign_attrs(standard_name='latitude') 
da_era5['longitude'] = da_era5['longitude'].assign_attrs(standard_name='longitude') 
print(da_era5['latitude'].attrs,da_era5['longitude'].attrs)
{'long_name': 'latitude', 'short_name': 'lat', 'units': 'degrees_north', 'standard_name': 'latitude'} {'long_name': 'longitude', 'short_name': 'lon', 'units': 'degrees_east', 'standard_name': 'longitude'}

Convert to HEALpix

ux_era5     = ux.UxDataset.from_structured(da_era5.to_dataset(name='T'))
ux_wrf      = ux.UxDataset.from_healpix(da_wrf.to_dataset(name='T'))
ux_wrf      = ux_wrf.rename({'Time':'time'})
# ux_wrf
ux_wrf
<xarray.UxDataset> Size: 143GB
Dimensions:  (time: 184104, n_face: 193604)
Coordinates:
  * time     (time) datetime64[ns] 1MB 2000-01-01 ... 2020-12-31T23:00:00
    cell     (n_face) int64 2MB 540117 540118 540119 ... 2097149 2097150 2097151
    crs      float32 4B nan
Dimensions without coordinates: n_face
Data variables:
    T        (time, n_face) float32 143GB dask.array<chunksize=(48, 65536), meta=np.ndarray>
# 2. Pull out the UxDataArray for each variable
da_era5_ux    = ux_era5['T']
da_wrf_ux     = ux_wrf['T']
da_wrf_ux
<xarray.UxDataArray 'T' (time: 184104, n_face: 193604)> Size: 143GB
dask.array<concatenate, shape=(184104, 193604), dtype=float32, chunksize=(48, 65536), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) datetime64[ns] 1MB 2000-01-01 ... 2020-12-31T23:00:00
    cell     (n_face) int64 2MB 540117 540118 540119 ... 2097149 2097150 2097151
    crs      float32 4B nan
Dimensions without coordinates: n_face
Attributes:
    description:       Air temperature at the lowest model level
    domain_extent:     (-138.73135, -57.068634, 17.647308, 57.34342)
    grid_mapping:      healpix_nested
    healpix_nside:     512
    quantization:      quantization_info
    quantization_nsd:  5
    regrid_method:     easygems_delaunay
    stagger:           
    units:             K

Subset to CONUS

da_wrf_ux
<xarray.UxDataArray 'T' (time: 184104, n_face: 193604)> Size: 143GB
dask.array<concatenate, shape=(184104, 193604), dtype=float32, chunksize=(48, 65536), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) datetime64[ns] 1MB 2000-01-01 ... 2020-12-31T23:00:00
    cell     (n_face) int64 2MB 540117 540118 540119 ... 2097149 2097150 2097151
    crs      float32 4B nan
Dimensions without coordinates: n_face
Attributes:
    description:       Air temperature at the lowest model level
    domain_extent:     (-138.73135, -57.068634, 17.647308, 57.34342)
    grid_mapping:      healpix_nested
    healpix_nside:     512
    quantization:      quantization_info
    quantization_nsd:  5
    regrid_method:     easygems_delaunay
    stagger:           
    units:             K
# Define CONUS (Contiguous United States) bounding box
lon_bounds = (-125.0, -66.5)  # (min_lon, max_lon)
lat_bounds = (17.65, 50.0)     # (min_lat, max_lat)
%%time
conus_era5    = da_era5_ux.subset.bounding_box(lon_bounds, lat_bounds)   
CPU times: user 122 ms, sys: 0 ns, total: 122 ms
Wall time: 121 ms
%%time
conus_wrf     = da_wrf_ux.subset.bounding_box(lon_bounds, lat_bounds)   
conus_wrf
CPU times: user 33.6 ms, sys: 0 ns, total: 33.6 ms
Wall time: 32.6 ms
<xarray.UxDataArray 'T' (time: 184104, n_face: 1713)> Size: 1GB
dask.array<getitem, shape=(184104, 1713), dtype=float32, chunksize=(48, 1713), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) datetime64[ns] 1MB 2000-01-01 ... 2020-12-31T23:00:00
    cell     (n_face) int64 14kB 554239 554255 554259 ... 597591 597592 597593
    crs      float32 4B nan
Dimensions without coordinates: n_face
Attributes:
    description:       Air temperature at the lowest model level
    domain_extent:     (-138.73135, -57.068634, 17.647308, 57.34342)
    grid_mapping:      healpix_nested
    healpix_nside:     512
    quantization:      quantization_info
    quantization_nsd:  5
    regrid_method:     easygems_delaunay
    stagger:           
    units:             K
print(conus_era5.attrs.get("units", "no units attribute"))
K
print(conus_wrf.attrs.get("units", "no units attribute"))
K

Compute seasonal max temperatures

def seasonal_max_uxarray(da, season="DJF"):
    """
    Compute seasonal max for a given season from a uxarray.DataArray,
    without adding a new coordinate.

    Parameters:
    -----------
    da : uxarray.DataArray
        Input DataArray on an unstructured mesh with a time coordinate.
    season : str
        One of ["DJF", "MAM", "JJA", "SON"]
    time_coord : str
        Name of the time coordinate in the DataArray.

    Returns:
    --------
    xr.DataArray
        Seasonal maximum temperature.
    """
    # Ensure time is datetime64
    if not np.issubdtype(da["time"].dtype, np.datetime64):
        da["time"] = xr.decode_cf(da).time

    # Use boolean indexing with dt accessor
    season_mask = da["time"].dt.season == season
    return da.sel(time=season_mask).max(dim="time")
def seasonal_mean_uxarray(da, season="DJF"):
    """
    Compute seasonal mean for a given season from a uxarray.DataArray,
    without adding a new coordinate.

    Parameters:
    -----------
    da : uxarray.DataArray
        Input DataArray on an unstructured mesh with a time coordinate.
    season : str
        One of ["DJF", "MAM", "JJA", "SON"]
    time_coord : str
        Name of the time coordinate in the DataArray.

    Returns:
    --------
    xr.DataArray
        Seasonal maximum temperature.
    """
    # Ensure time is datetime64
    if not np.issubdtype(da["time"].dtype, np.datetime64):
        da["time"] = xr.decode_cf(da).time

    # Use boolean indexing with dt accessor
    season_mask = da["time"].dt.season == season
    return da.sel(time=season_mask).mean(dim="time")
%%time
conus_wrf_jja_max = seasonal_max_uxarray(conus_wrf, 'JJA')
conus_era5_jja_max = seasonal_max_uxarray(conus_era5, 'JJA')
conus_wrf_jja_max
CPU times: user 107 ms, sys: 7.8 ms, total: 115 ms
Wall time: 114 ms
<xarray.UxDataArray 'T' (n_face: 1713)> Size: 7kB
dask.array<_nanmax_skip-aggregate, shape=(1713,), dtype=float32, chunksize=(1713,), chunktype=numpy.ndarray>
Coordinates:
    cell     (n_face) int64 14kB 554239 554255 554259 ... 597591 597592 597593
    crs      float32 4B nan
Dimensions without coordinates: n_face
# np.set_printoptions(threshold=np.inf)
%%time
conus_wrf_jja_mean = seasonal_mean_uxarray(conus_wrf, 'JJA')
conus_era5_jja_mean = seasonal_mean_uxarray(conus_era5, 'JJA')
# conus_wrf_jja_mean
CPU times: user 114 ms, sys: 0 ns, total: 114 ms
Wall time: 112 ms
# Seasonal max - Seasonal mean
conus_wrf_jja_anomaly = conus_wrf_jja_max - conus_wrf_jja_mean
conus_era5_jja_anomaly = conus_era5_jja_max - conus_era5_jja_mean
# %%time
# conus_era5_jja_max.to_zarr('conus_era5_jja_max.zarr',mode='w')
CPU times: user 20 s, sys: 420 ms, total: 20.4 s
Wall time: 34.2 s
<xarray.backends.zarr.ZarrStore at 0x14bb85788dc0>
# %%time
# conus_wrf_jja_max.to_zarr('conus_era5_jja_max.zarr',mode='w')
# conus_era5_jja_max = xr.open_zarr(conus_era5_jja_max.zarr)
# conus_era5_jja_max 
%%time
# conus bounds
# lon_min, lon_max = -125.0, -66.5
# lat_min, lat_max = 24.0, 50.0
#
conus_era5_jja_anomaly.plot(projection=ccrs.PlateCarree(),features=['borders','coastline'],cmap='Reds') + \
conus_wrf_jja_anomaly.plot(projection=ccrs.PlateCarree(),features=['borders','coastline'],cmap='Reds')
CPU times: user 1min 19s, sys: 4.38 s, total: 1min 23s
Wall time: 49.5 s